GitHub repository : 605project
Selected dataset : A dataset about stock price from Kaggle
The whole dataset is around 13 GB and contains 100 stocks (the Nifty 100 stocks) and 2 indices (the Nifty 50 and Nifty Bank indices). The data for each stock is stored in a separate CSV file. This dataset is not only in line with our interests, but also well suited to parallel computation.
Since the structures of the 100 CSV files are almost the same, we select just one stock (ACC) as an example here.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#sns.set_style('whitegrid')
#plt.style.use('fivethirtyeight')
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode
#init_notebook_mode(connected = True)
#%matplotlib inline
from pandas_datareader.data import DataReader
#import yfinance as yf
from datetime import datetime
# Load the ACC price bars together with their precomputed indicator columns.
filename = 'D:/文件/学习资料/22fall/STAT 605/605project/ACC_with_indicators_.csv'
df = pd.read_csv(filename)
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 130348 entries, 0 to 130347 Data columns (total 59 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date 130348 non-null object 1 open 130348 non-null float64 2 high 130348 non-null float64 3 low 130348 non-null float64 4 close 130348 non-null float64 5 volume 130348 non-null int64 6 sma5 130348 non-null float64 7 sma10 130348 non-null float64 8 sma15 130348 non-null float64 9 sma20 130348 non-null float64 10 ema5 130348 non-null float64 11 ema10 130348 non-null float64 12 ema15 130348 non-null float64 13 ema20 130348 non-null float64 14 upperband 130348 non-null float64 15 middleband 130348 non-null float64 16 lowerband 130348 non-null float64 17 HT_TRENDLINE 130348 non-null float64 18 KAMA10 130348 non-null float64 19 KAMA20 130348 non-null float64 20 KAMA30 130348 non-null float64 21 SAR 130348 non-null float64 22 TRIMA5 130348 non-null float64 23 TRIMA10 130348 non-null float64 24 TRIMA20 130348 non-null float64 25 ADX5 130348 non-null float64 26 ADX10 130348 non-null float64 27 ADX20 130348 non-null float64 28 APO 130348 non-null float64 29 CCI5 130348 non-null float64 30 CCI10 130348 non-null float64 31 CCI15 130348 non-null float64 32 macd510 130348 non-null float64 33 macd520 130348 non-null float64 34 macd1020 130348 non-null float64 35 macd1520 130348 non-null float64 36 macd1226 130348 non-null float64 37 MOM10 130348 non-null float64 38 MOM15 130348 non-null float64 39 MOM20 130348 non-null float64 40 ROC5 130348 non-null float64 41 ROC10 130348 non-null float64 42 ROC20 130348 non-null float64 43 PPO 130348 non-null float64 44 RSI14 130348 non-null float64 45 RSI8 130348 non-null float64 46 slowk 130348 non-null float64 47 slowd 130348 non-null float64 48 fastk 130348 non-null float64 49 fastd 130348 non-null float64 50 fastksr 130348 non-null float64 51 fastdsr 130348 non-null float64 52 ULTOSC 130348 non-null float64 53 WILLR 130348 non-null float64 54 ATR 130348 non-null 
float64 55 Trange 130348 non-null float64 56 TYPPRICE 130348 non-null float64 57 HT_DCPERIOD 130348 non-null float64 58 BETA 130348 non-null float64 dtypes: float64(57), int64(1), object(1) memory usage: 58.7+ MB
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()
| open | high | low | close | volume | sma5 | sma10 | sma15 | sma20 | ema5 | ... | fastd | fastksr | fastdsr | ULTOSC | WILLR | ATR | Trange | TYPPRICE | HT_DCPERIOD | BETA | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 130348.000000 | 130348.000000 | 130348.000000 | 130348.000000 | 1.303480e+05 | 130348.000000 | 130348.000000 | 130348.000000 | 130348.000000 | 130348.000000 | ... | 1.303480e+05 | 130348.000000 | 1.303480e+05 | 130348.000000 | 130348.000000 | 130348.000000 | 130348.000000 | 130348.000000 | 130348.000000 | 130348.000000 |
| mean | 1593.299361 | 1595.305878 | 1591.248035 | 1593.273804 | 8.308222e+03 | 1593.263916 | 1593.251734 | 1593.239523 | 1593.227221 | 1593.263943 | ... | 4.927998e+01 | 49.033892 | 4.903338e+01 | 49.719530 | -50.758167 | 4.113487 | 4.113410 | 1593.275906 | 21.536557 | 0.478511 |
| std | 285.275439 | 285.529564 | 285.010866 | 285.268984 | 1.821215e+04 | 285.244782 | 285.218411 | 285.192542 | 285.166763 | 285.237925 | ... | 2.385871e+01 | 41.123551 | 3.170282e+01 | 10.421884 | 28.414086 | 1.707523 | 3.361245 | 285.266169 | 5.108991 | 0.818136 |
| min | 901.000000 | 904.450000 | 895.150000 | 900.350000 | 0.000000e+00 | 904.580000 | 905.550000 | 906.456667 | 906.995000 | 904.462823 | ... | -1.431744e-12 | 0.000000 | -6.489624e-13 | 10.410101 | -100.000000 | 0.856757 | 0.000000 | 899.983333 | 9.291003 | -29.574415 |
| 25% | 1405.150000 | 1407.000000 | 1403.587500 | 1405.150000 | 1.855000e+03 | 1405.390000 | 1405.408750 | 1405.440000 | 1405.382500 | 1405.382110 | ... | 2.938827e+01 | 0.000000 | 2.117341e+01 | 42.545650 | -75.294118 | 2.967718 | 2.150000 | 1405.250000 | 17.818637 | 0.108752 |
| 50% | 1541.400000 | 1543.000000 | 1539.925000 | 1541.400000 | 4.118000e+03 | 1541.360000 | 1541.355000 | 1541.396667 | 1541.452500 | 1541.313757 | ... | 4.889764e+01 | 47.566666 | 4.840934e+01 | 49.751144 | -51.282051 | 3.780696 | 3.350000 | 1541.366667 | 20.625720 | 0.424162 |
| 75% | 1690.412500 | 1692.550000 | 1688.700000 | 1690.350000 | 9.093000e+03 | 1690.382500 | 1690.375000 | 1690.397500 | 1690.350625 | 1690.357473 | ... | 6.898565e+01 | 100.000000 | 7.631643e+01 | 56.931925 | -26.209421 | 4.858908 | 5.000000 | 1690.466667 | 24.335306 | 0.783740 |
| max | 2574.950000 | 2587.800000 | 2572.450000 | 2575.550000 | 1.396017e+06 | 2572.650000 | 2572.290000 | 2572.053333 | 2571.707500 | 2572.561341 | ... | 1.000000e+02 | 100.000000 | 1.000000e+02 | 86.826440 | -0.000000 | 25.653432 | 160.150000 | 2577.116667 | 46.597027 | 38.173669 |
8 rows × 58 columns
# Preview the first five rows of the frame.
df.head()
| date | open | high | low | close | volume | sma5 | sma10 | sma15 | sma20 | ... | fastd | fastksr | fastdsr | ULTOSC | WILLR | ATR | Trange | TYPPRICE | HT_DCPERIOD | BETA | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2015-02-02 14:30:00+05:30 | 1528.50 | 1529.95 | 1526.05 | 1527.40 | 4678 | 1538.82 | 1543.015 | 1542.016667 | 1539.8375 | ... | 4.838951 | 0.0 | 0.000000 | 43.346867 | -95.063985 | 5.282946 | 3.90 | 1527.800000 | 25.928999 | 0.479466 |
| 1 | 2015-02-02 14:35:00+05:30 | 1527.40 | 1528.00 | 1516.00 | 1521.95 | 10165 | 1532.81 | 1540.670 | 1541.213333 | 1539.2850 | ... | 7.147969 | 0.0 | 0.000000 | 41.448445 | -84.090909 | 5.762736 | 12.00 | 1521.983333 | 25.595475 | 0.200019 |
| 2 | 2015-02-02 14:40:00+05:30 | 1521.30 | 1526.70 | 1521.00 | 1521.55 | 8078 | 1527.52 | 1538.205 | 1540.316667 | 1538.7225 | ... | 12.588612 | 0.0 | 0.000000 | 36.648343 | -85.160428 | 5.758254 | 5.70 | 1523.083333 | 25.184555 | 0.450949 |
| 3 | 2015-02-02 14:45:00+05:30 | 1520.65 | 1522.90 | 1519.80 | 1520.25 | 4733 | 1523.93 | 1535.725 | 1538.996667 | 1538.1250 | ... | 17.267679 | 0.0 | 0.000000 | 30.139572 | -88.636364 | 5.568379 | 3.10 | 1520.983333 | 25.349728 | 0.560333 |
| 4 | 2015-02-02 14:50:00+05:30 | 1521.20 | 1526.10 | 1516.25 | 1526.10 | 4636 | 1523.45 | 1533.440 | 1537.406667 | 1537.6800 | ... | 36.098460 | 100.0 | 33.333333 | 41.145881 | -72.994652 | 5.874209 | 9.85 | 1522.816667 | 26.308002 | -0.058313 |
5 rows × 59 columns
# Draw one area chart per series: closing price first, then traded volume,
# both against the `date` column.
for column in ('close', 'volume'):
    fig = px.area(df, x=df['date'], y=column)
    fig.show()
# Row-to-row percentage change of the closing price. The column is named
# 'Daily Return', but each row is one bar of the input file, so this is the
# change from the previous bar.
df['Daily Return'] = df.close.pct_change()

# Return series over time.
fig = px.area(df, x=df['date'], y='Daily Return')
fig.show()

# Distribution of the returns.
fig = px.histogram(df['Daily Return'])
fig.show()

# Sum of all per-row returns (the first row is NaN and is skipped by sum()).
df['Daily Return'].sum()
0.6636422462653006
# fig = go.Figure(data=[go.Candlestick(x=df.date,
# open=df['open'],
# high=df['high'],
# low=df['low'],
# close=df['close']
# )])
# fig.show()
# Keep only the closing price, as a one-column frame and as a 2-D array.
data = df.filter(items=['close'])
dataset = data.to_numpy()
# The first 95% of the rows (rounded up) are used for training.
training_data_len = int(np.ceil(0.95 * len(dataset)))
training_data_len
123831
from sklearn.preprocessing import MinMaxScaler

# Learn the min/max from the training rows only, so that statistics of the
# held-out tail of the series do not leak into the preprocessing.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(dataset[:training_data_len])
# Apply the training-range scaling to the whole series. Held-out values can
# fall outside [0, 1], and the scaled values may differ from results that
# were produced with a scaler fitted on the full series.
scaled_data = scaler.transform(dataset)
scaled_data
array([[0.37431351],
[0.37106017],
[0.37082139],
...,
[0.76441619],
[0.76390879],
[0.76668457]])
# Build supervised samples from the training rows: each input is a run of 60
# consecutive scaled closes and the target is the scaled close that follows.
train_data = scaled_data[:int(training_data_len), :]
window_ends = range(60, len(train_data))
x_train = np.array([train_data[end - 60:end, 0] for end in window_ends])
y_train = np.array([train_data[end, 0] for end in window_ends])
# Append a trailing axis of length 1 so the inputs are
# (samples, time steps, 1), matching the LSTM's input_shape.
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
from keras.models import Sequential
from keras.layers import Dense, LSTM

# Two stacked LSTM layers (128 then 64 units) followed by two dense layers,
# the last of which emits a single value per input window.
model = Sequential([
    LSTM(128, return_sequences=True, input_shape=(x_train.shape[1], 1)),
    LSTM(64, return_sequences=False),
    Dense(25),
    Dense(1),
])
# Adam optimizer, mean-squared-error loss on the scaled closes.
model.compile(optimizer='adam', loss='mean_squared_error')
# A single epoch with one sample per batch.
model.fit(x_train, y_train, batch_size=1, epochs=1)
123771/123771 [==============================] - 2445s 20ms/step - loss: 5.3926e-05
<keras.callbacks.History at 0x275bf6520d0>
# Scaled rows of the held-out period, preceded by the last 60 training rows
# so that the first held-out row has a full 60-step look-back window.
test_data = scaled_data[training_data_len - 60:, :]
# Unscaled closing prices of the held-out period (the prediction targets).
y_test = dataset[training_data_len:, :]
# One 60-step input window per held-out row.
x_test = np.array(
    [test_data[end - 60:end, 0] for end in range(60, len(test_data))]
)
# Append a trailing axis of length 1 to match the model's input shape.
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
# Predict in scaled space, then map the predictions back to price units.
predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)
# Root mean squared error between predicted and actual closing prices.
rmse = np.sqrt(np.mean((predictions - y_test) ** 2))
rmse
204/204 [==============================] - 5s 18ms/step
12.636955626772885
# Split the close-price frame at the same row as the model's train/test split.
train = data[:training_data_len]
# Take an explicit copy of the held-out slice: adding a column to a bare slice
# of `data` raises pandas' SettingWithCopyWarning and may not behave as
# intended.
valid = data[training_data_len:].copy()
# Attach the model's predicted prices next to the actual closes.
valid['Predictions'] = predictions
<ipython-input-13-8f78faa8f9ca>:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Plot the training closes, the held-out closes and the model's predictions
# on one set of axes. The series are drawn against the frames' row index.
ax = plt.figure(figsize=(16, 6)).gca()
ax.plot(train['close'])
ax.plot(valid[['close', 'Predictions']])
ax.set_title('Model')
ax.set_xlabel('Date', fontsize=18)
ax.set_ylabel('Close Price', fontsize=18)
ax.legend(['Train', 'Val', 'Predictions'], loc='lower right')
plt.show()
# Show the held-out closes alongside their predictions.
valid
| close | Predictions | |
|---|---|---|
| 123831 | 2333.70 | 2351.604248 |
| 123832 | 2335.20 | 2346.500244 |
| 123833 | 2333.40 | 2348.466309 |
| 123834 | 2329.90 | 2346.479248 |
| 123835 | 2335.20 | 2343.041992 |
| ... | ... | ... |
| 130343 | 2181.10 | 2196.764893 |
| 130344 | 2180.70 | 2192.002197 |
| 130345 | 2180.90 | 2191.606201 |
| 130346 | 2180.05 | 2191.733398 |
| 130347 | 2184.70 | 2190.779785 |
6517 rows × 2 columns